package org.apache.lucene.demo;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;
import org.apache.lucene.document.*;
import org.apache.lucene.demo.html.HTMLParser;

/**
 * A utility for making Lucene Documents for HTML documents.
 *
 * <p>All members are static; the class cannot be instantiated.
 */

public class HTMLDocument {
    /** Platform directory separator; first character of the "file.separator" property. */
    static final char dirSep = File.separatorChar;

    /**
     * Builds the unique id of a file from its path and last-modified time.
     *
     * <p>Path and date are appended in such a way that lexicographic sorting
     * gives the same results as a walk of the file hierarchy.  Thus the NUL
     * character (U+0000) is used both to separate directory components and to
     * separate the path from the date.
     *
     * @param f the file to identify
     * @return the path with every directory separator replaced by NUL, followed
     *         by a NUL and the last-modified time at second resolution
     */
    public static String uid(File f) {
        String nulSeparatedPath = f.getPath().replace(dirSep, '\u0000');
        String modified = DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND);
        return nulSeparatedPath + "\u0000" + modified;
    }

    /**
     * Converts a uid produced by {@link #uid(File)} back into a '/'-separated url.
     *
     * @param uid the uid to convert
     * @return the uid with NUL characters replaced by slashes and the trailing
     *         date component (everything from the last slash on) removed; if
     *         the result contains no slash at all there is no date component
     *         to strip and the converted string is returned unchanged
     */
    public static String uid2url(String uid) {
        String url = uid.replace('\u0000', '/'); // replace nulls with slashes
        int dateStart = url.lastIndexOf('/');
        if (dateStart < 0) {
            // No separator: substring(0, -1) would throw, so return as is.
            return url;
        }
        return url.substring(0, dateStart); // remove date from end
    }

    /**
     * Makes a Lucene document for an HTML file.
     *
     * <p>The document gets the fields "path", "modified", "uid", "contents",
     * "summary" and "title".
     *
     * <p>On success the underlying file stream is deliberately left open: the
     * "contents" field holds a Reader obtained from the parser, which has not
     * been read yet when this method returns (presumably it is consumed when
     * the document is added to an index).  If building the document fails, the
     * stream is closed before the exception propagates.
     *
     * @param f the HTML file to index
     * @return the populated document
     * @throws IOException if the file cannot be opened or the parser reports
     *         an I/O failure
     * @throws InterruptedException declared for the parser calls made here
     */
    public static Document Document(File f) throws IOException, InterruptedException {
        // make a new, empty document
        Document doc = new Document();

        // Add the url as a field named "path".  Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", f.getPath().replace(dirSep, '/'), Field.Store.YES, Field.Index.UN_TOKENIZED));

        // Add the last modified date of the file as a field named "modified".
        // Use a field that is indexed (i.e. searchable), but don't tokenize
        // the field into words.
        doc.add(new Field("modified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.UN_TOKENIZED));

        // Add the uid as a field, so that index can be incrementally maintained.
        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));

        FileInputStream fis = new FileInputStream(f);
        boolean success = false;
        try {
            HTMLParser parser = new HTMLParser(fis);

            // Add the tag-stripped contents as a Reader-valued Text field so it will
            // get tokenized and indexed.
            doc.add(new Field("contents", parser.getReader()));

            // Add the summary as a field that is stored and returned with
            // hit documents for display.
            doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));

            // Add the title as a field that it can be searched and that is stored.
            doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));

            success = true;

            // return the document
            return doc;
        } finally {
            if (!success) {
                // The document is being abandoned, so nothing will ever read
                // the stream; release the file handle instead of leaking it.
                try {
                    fis.close();
                } catch (IOException ignored) {
                    // Keep the original failure as the exception that propagates.
                }
            }
        }
    }

    /** Not instantiable: this class only offers static helpers. */
    private HTMLDocument() {
    }
}
